# Computations
import numpy as np
import pandas as pd
import numba
# Tools
import os
import datetime
import itertools
import pickle
from scipy import stats
# sklearn
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import StackingRegressor, GradientBoostingRegressor, RandomForestRegressor,\
BaggingRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.compose import make_column_transformer
from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_validate, cross_val_predict
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn import set_config
# Render sklearn pipelines as HTML diagrams in the notebook output.
set_config(display='diagram')
# Visualisation libraries
## progressbar
import progressbar
## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex, clear_output
## plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
## seaborn
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("paper", rc={"font.size":12,"axes.titlesize":14,"axes.labelsize":12})
## matplotlib
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
import matplotlib.colors as mcolors
from matplotlib.patches import Ellipse, Polygon
import matplotlib.gridspec as gridspec
from pylab import rcParams
# NOTE(review): the 'seaborn-whitegrid' style name was deprecated in Matplotlib 3.6
# and removed in 3.8; recent versions require 'seaborn-v0_8-whitegrid' instead.
plt.style.use('seaborn-whitegrid')
import matplotlib as mpl
# Global figure defaults applied to every matplotlib plot below.
mpl.rcParams['figure.figsize'] = (17, 6)
mpl.rcParams['axes.labelsize'] = 14
mpl.rcParams['xtick.labelsize'] = 12
mpl.rcParams['ytick.labelsize'] = 12
mpl.rcParams['text.color'] = 'k'
%matplotlib inline
import warnings
# Silence third-party deprecation chatter in the rendered notebook.
warnings.filterwarnings("ignore")

In this article, we use a dataset from Kaggle.com.
Ask a home buyer to describe their dream house, and they probably won't begin with the height of the basement ceiling or the proximity to an east-west railroad. But this playground competition's dataset proves that much more influences price negotiations than the number of bedrooms or a white-picket fence.
With 79 explanatory variables describing (almost) every aspect of residential homes in Ames, Iowa, this competition challenges you to predict the final price of each home.
Here's a brief version of what you'll find in the data description file.
| Feature | Description | Feature | Description |
|---|---|---|---|
| SalePrice | the property's sale price in dollars. | HeatingQC | Heating quality and condition |
| MSSubClass | The building class | CentralAir | Central air conditioning |
| MSZoning | The general zoning classification | Electrical | Electrical system |
| LotFrontage | Linear feet of street connected to property | 1stFlrSF | First Floor square feet |
| LotArea | Lot size in square feet | 2ndFlrSF | Second floor square feet |
| Street | Type of road access | LowQualFinSF | Low quality finished square feet (all floors) |
| Alley | Type of alley access | GrLivArea | Above grade (ground) living area square feet |
| LotShape | General shape of property | BsmtFullBath | Basement full bathrooms |
| LandContour | Flatness of the property | BsmtHalfBath | Basement half bathrooms |
| Utilities | Type of utilities available | FullBath | Full bathrooms above grade |
| LotConfig | Lot configuration | HalfBath | Half baths above grade |
| LandSlope | Slope of property | Bedroom | Number of bedrooms above basement level |
| Neighborhood | Physical locations within Ames city limits | Kitchen | Number of kitchens |
| Condition1 | Proximity to main road or railroad | KitchenQual | Kitchen quality |
| Condition2 | Proximity to main road or railroad (if a second is present) | TotRmsAbvGrd | Total rooms above grade (does not include bathrooms) |
| BldgType | Type of dwelling | Functional | Home functionality rating |
| HouseStyle | Style of dwelling | Fireplaces | Number of fireplaces |
| OverallQual | Overall material and finish quality | FireplaceQu | Fireplace quality |
| OverallCond | Overall condition rating | GarageType | Garage location |
| YearBuilt | Original construction date | GarageYrBlt | Year garage was built |
| YearRemodAdd | Remodel date | GarageFinish | Interior finish of the garage |
| RoofStyle | Type of roof | GarageCars | Size of garage in car capacity |
| RoofMatl | Roof material | GarageArea | Size of garage in square feet |
| Exterior1st | Exterior covering on house | GarageQual | Garage quality |
| Exterior2nd | Exterior covering on house (if more than one material) | GarageCond | Garage condition |
| MasVnrType | Masonry veneer type | PavedDrive | Paved driveway |
| MasVnrArea | Masonry veneer area in square feet | WoodDeckSF | Wood deck area in square feet |
| ExterQual | Exterior material quality | OpenPorchSF | Open porch area in square feet |
| ExterCond | Present condition of the material on the exterior | EnclosedPorch | Enclosed porch area in square feet |
| Foundation | Type of foundation | 3SsnPorch | Three season porch area in square feet |
| BsmtQual | Height of the basement | ScreenPorch | Screen porch area in square feet |
| BsmtCond | General condition of the basement | PoolArea | Pool area in square feet |
| BsmtExposure | Walkout or garden level basement walls | PoolQC | Pool quality |
| BsmtFinType1 | Quality of basement finished area | Fence | Fence quality |
| BsmtFinSF1 | Type 1 finished square feet | MiscFeature | Miscellaneous feature not covered in other categories |
| BsmtFinType2 | Quality of second finished area (if present) | MiscVal | Value of miscellaneous feature |
| BsmtFinSF2 | Type 2 finished square feet | MoSold | Month Sold |
| BsmtUnfSF | Unfinished square feet of basement area | YrSold | Year Sold |
| TotalBsmtSF | Total square feet of basement area | SaleType | Type of sale |
| Heating | Type of heating | SaleCondition | Condition of sale |
PATH = 'house-prices-advanced-regression-techniques'
# keep_default_na=False: literal 'NA' strings in the CSV stay as category labels
# instead of being parsed into NaN.
Data = pd.read_csv(PATH + '/House_Prices.csv', keep_default_na=False)
Target = 'SalePrice'
# The 'Set' column tags each row as 'Train' or 'Test'; split on it and drop the tag.
Train = Data[Data['Set'] == 'Train'].drop(columns=['Set'])
Train['SalePrice'] = Train['SalePrice'].astype(float)
# The test partition has no usable target, so drop it as well.
Test = Data[Data['Set'] == 'Test'].drop(columns=['Set', Target])
def Search_List(Key, List):
    """Return every element of List that contains the substring Key."""
    return list(filter(lambda item: Key in item, List))
def Data_Plot(Inp, W=False):
    """Plot per-feature completeness (% non-null), coloured by dtype, and return the summary frame.

    Inp : DataFrame to summarise.
    W   : optional figure width in pixels; falsy keeps plotly's default.
    """
    n_rows = Inp.shape[0]
    summary = Inp.dtypes.astype(str).to_frame(name='Data Type')
    summary = summary.join(Inp.isnull().sum().to_frame(name='Number of NaN Values'), how='outer')
    summary['Size'] = n_rows
    # Share of non-missing entries per feature, in percent.
    summary['Percentage'] = 100 - np.round(100 * (summary['Number of NaN Values'] / n_rows), 2)
    summary.index.name = 'Features'
    summary = summary.reset_index(drop=False)
    #
    fig = px.bar(summary, x='Features', y='Percentage', color='Data Type', text='Data Type',
                 color_discrete_sequence=['PaleGreen', 'LightCyan', 'PeachPuff', 'Pink', 'Plum'],
                 hover_data=summary.columns)
    fig.update_layout(plot_bgcolor='white',
                      legend=dict(x=1.01, y=.5, traceorder="normal",
                                  bordercolor="DarkGray", borderwidth=1))
    fig.update_traces(texttemplate=6*' ' + '%{label}', textposition='inside')
    fig.update_traces(marker_line_color='Black', marker_line_width=1., opacity=1)
    if W:
        fig.update_layout(width=W)
    fig.show()
    return summary
# Completeness/dtype overview of the full dataset, ordered by feature name.
data_info = Data_Plot(Data).sort_values('Features', ascending=True)
# 2x2 panel: raw SalePrice (top row) vs log(SalePrice) (bottom row),
# each shown as a density histogram and a normal probability plot.
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(16, 12))
ax = ax.ravel()
# Top
# NOTE(review): sns.distplot is deprecated in recent seaborn releases;
# newer versions need sns.histplot(..., stat='density', kde=True).
_ = sns.distplot(Train['SalePrice'], color= 'Indigo' , norm_hist=True, ax = ax[0])
_ = ax[0].set_ylim([0, 1e-5])
_ = ax[0].set_xlabel('Sale Price')
_ = ax[0].set_title('Distribution of Observations')
_ = stats.probplot(Train['SalePrice'], plot= ax[1])
_ = ax[1].set_ylim([0, 8e5])
_ = ax[1].set_title('Probability Plot: Sale Price')
# Bottom
_ = sns.distplot(np.log(Train['SalePrice']), color= 'DarkGreen' , norm_hist=True, ax = ax[2])
_ = ax[2].set_ylim([0, 1.2])
_ = ax[2].set_xlabel('log(Sale Price)')
_ = ax[2].set_title('Distribution of Observations')
_ = stats.probplot(np.log(Train['SalePrice']), plot= ax[3])
_ = ax[3].set_ylim([10, 14])
_ = ax[3].set_title('Probability Plot: log(Sale Price)')
Therefore, we apply a log transformation to the target, which brings its distribution much closer to normal:
# Log-transform the target: the probability plots above show log(SalePrice)
# follows a normal distribution far better than the raw price does.
Train['SalePrice'] = np.log(Train['SalePrice'])
Let's take a look at the variance of the features.
# One-row heatmap of per-feature variances (descending) for the training features.
Fig, ax = plt.subplots(figsize=(17,16))
Temp = Train.drop(columns = [Target, 'ID']).var().sort_values(ascending = False).to_frame(name= 'Variance').round(2).T
# vmax is anchored at the largest variance so the colour scale spans the data range.
_ = sns.heatmap(Temp, ax=ax, annot=True, square=True, cmap =sns.color_palette("OrRd", 20),
linewidths = 0.8, vmin=0, vmax=Temp.max(axis =1)[0], annot_kws={"size": 8},
cbar_kws={'label': 'Feature Variance', "aspect":40, "shrink": .4, "orientation": "horizontal"})
_ = ax.set_yticklabels('')
High variance for some features can have a negative impact on our modeling process. For this reason, we would like to standardize the features by removing the mean and scaling to unit variance. To do so, we take a unique approach in preparing the data for end-to-end modeling: we separate the numeric and categorical features and treat each group separately.
# Feature matrix / target vector for model fitting ('ID' carries no signal).
X = Train.drop(columns=[Target, 'ID'])
y = Train[Target]
# One row per feature with its dtype as a string, ordered by dtype.
Temp = X.dtypes.astype(str).rename('DataType').to_frame().sort_values(by='DataType')
def Header(Text, L = 100, C1 = Back.BLUE, C2 = Fore.BLUE):
    """Print Text on a coloured banner, padded with '=' up to L characters wide."""
    banner = f"{C1}{Fore.WHITE}{Style.NORMAL}{Text}{Style.RESET_ALL} {C2}{Style.NORMAL}"
    banner += '=' * (L - len(Text) - 1) + Style.RESET_ALL
    print(banner)
def Line(L=100, C = Fore.BLUE):
    """Print a horizontal rule of L '=' characters in colour C."""
    print(f"{C}{Style.NORMAL}{'=' * L}{Style.RESET_ALL}")
# Split feature names by dtype: 'object' columns are categorical, everything else numeric.
Numeric_Columns = [col for col in Temp.index if Temp.loc[col, 'DataType'] != 'object']
Categorical_Columns = [col for col in Temp.index if Temp.loc[col, 'DataType'] == 'object']
del Temp
# Report both groups in the notebook output.
Header('Numeric Columns')
print(', '.join(Numeric_Columns))
Line()
Header('Categorical Columns')
print(', '.join(Categorical_Columns))
Line()
In this section, we test a number of efficient scikit-learn regressors. Then from those that have performed well, a stacked model can be formed. In particular, we use the following models:
| Regressor | Link |
|---|---|
| Bagging Regressor | sklearn.ensemble.BaggingRegressor |
| Decision Tree Regressor | sklearn.tree.DecisionTreeRegressor |
| Gradient Boosting Regressor | sklearn.ensemble.GradientBoostingRegressor |
| MLP Regressor | sklearn.neural_network.MLPRegressor |
| Random Forest Regressor | sklearn.ensemble.RandomForestRegressor |
# Candidate regressors, keyed by the display name used in tables and plots.
# Names fixed to match the article's table ('Regressorr' typo corrected,
# labels made consistent with the sklearn class names).
Regressors = {'Bagging Regressor': BaggingRegressor(),
              'Decision Tree Regressor': DecisionTreeRegressor(),
              'Gradient Boosting Regressor': GradientBoostingRegressor(),
              'MLP Regressor': MLPRegressor(),
              'Random Forest Regressor': RandomForestRegressor()}
TestR2 = {}          # mean test R^2 of every model that clears Threshold
Threshold = .8       # minimum mean cross-validated R^2 to qualify for the final pick
flag = True          # True until the first score table has been created
RegList = list(Regressors.keys())
N = len(RegList)
# Scale numeric features; one-hot encode categoricals. handle_unknown='ignore'
# keeps transform from crashing on categories unseen during fit (e.g. in the test set).
preprocessor = make_column_transformer((StandardScaler(), list(Numeric_Columns)),
                                       (OneHotEncoder(handle_unknown='ignore'), list(Categorical_Columns)))
def ScoreTable(reg, RegName, X = X, y = y):
    """Cross-validate *reg* on (X, y) and summarise the scores.

    Returns
    -------
    score  : dict from sklearn.cross_validate (arrays per fold), with the
             negated error metrics flipped back to positive values.
    Scores : one-column DataFrame 'mean ± std' per metric, column named RegName.
    """
    score = cross_validate(reg, X, y,
                           scoring=['r2', 'neg_mean_absolute_error', 'neg_mean_squared_error'],
                           return_train_score=True, n_jobs=-1, verbose=0)
    # sklearn reports error metrics negated (greater is better); flip the signs so
    # the table shows positive errors. BUG FIX: the original only flipped MAE, so
    # MSE was displayed as a negative number under a "Mean Squared Error" label.
    for key in ('test_neg_mean_absolute_error', 'train_neg_mean_absolute_error',
                'test_neg_mean_squared_error', 'train_neg_mean_squared_error'):
        score[key] = -score[key]
    frame = pd.DataFrame(score)
    Scores = frame.mean(axis=0).map(lambda x: ('%.2f' % x)) + ' ± ' + \
             frame.std(axis=0).map(lambda x: ('%.2e' % x))
    Scores = Scores.to_frame(RegName)
    # Prettify metric names, e.g. 'test_neg_mean_absolute_error' -> 'Test: Mean Absolute Error'.
    # ('Neg ' is stripped with its trailing space to avoid the double-space artifact.)
    Scores.index = [x.replace('_', ' ').title().replace('Train', 'Train:')
                     .replace('Test', 'Test:').replace('Neg ', '')
                    for x in Scores.index]
    return score, Scores
def plot_regression_results(reg, RegName, ax = False, X=X, y=y):
    """Scatter cross-validated predictions against the measured target on a square axis.

    reg     : estimator (pipeline) to evaluate with cross_val_predict.
    RegName : title for the axis.
    ax      : existing matplotlib axis; falsy creates a fresh 6x6 figure.
    """
    predicted = cross_val_predict(reg, X, y, n_jobs=-1, verbose=0)
    if not ax:
        fig, ax = plt.subplots(figsize=(6, 6))
    _ = ax.scatter(y, predicted, facecolors='SkyBlue', edgecolors='MidnightBlue', alpha=0.8)
    # Common integer-rounded limits so both axes share the same scale.
    combined = np.stack((y.values, predicted), axis=0)
    lo = np.floor(combined.min())
    hi = np.ceil(combined.max())
    _ = ax.set_xlim(left=lo, right=hi)
    _ = ax.set_ylim(bottom=lo, top=hi)
    # Identity line: perfect predictions fall exactly on it.
    _ = ax.plot([ax.get_xlim()[0], ax.get_xlim()[1]],
                [ax.get_ylim()[0], ax.get_ylim()[1]], '--r', linewidth=2)
    _ = ax.set_xlabel('Measured log(Sale Price)', fontsize=14)
    _ = ax.set_ylabel('Predicted log(Sale Price)', fontsize=14)
    _ = ax.set_title(RegName, fontsize=18)
    _ = ax.set(aspect='equal')
# Plot grid: two columns, enough rows to hold all N regressors.
rows = int(np.ceil(N/2))
fig, axes = plt.subplots(nrows= rows, ncols=2, figsize=(14, rows*7))
axes = axes.ravel()
# With an odd N the last axis of the grid is unused; remove it.
if len(axes)>N:
    _ = fig.delaxes(axes[-1])
# Progressbar
Progress_Bar = progressbar.ProgressBar(maxval = N, widgets=[progressbar.Bar('=', '|', '|'), progressbar.Percentage()])
Progress_Bar.start()
# Fit and cross-validate each candidate model inside the shared preprocessing pipeline.
for Counter in range(N):
    #
    Progress_Bar.update(Counter)
    RegName = RegList[Counter]
    reg = make_pipeline(preprocessor, Regressors[RegName])
    _ = reg.fit(X, y)
    score, Temp = ScoreTable(reg, RegName)
    # The first iteration starts the summary table; later ones append a column per model.
    if flag:
        Scores = Temp.copy()
    else:
        Scores = pd.concat([Scores,Temp], axis = 1)
    flag = False
    #
    # Remember models whose mean test R^2 clears Threshold; they are the
    # candidates for the final model selection below.
    if score['test_r2'].mean() >= Threshold:
        TestR2[RegName] = score['test_r2'].mean()
    ax = axes[Counter]
    plot_regression_results(reg, RegName, ax)
Progress_Bar.finish()
del ax, axes
Moreover, the following table highlights the performance of the regressors.
# Summary table of all cross-validated regressors.
display(Scores)
# Pick the regressor with the highest mean test R^2 among those above Threshold.
# (max over the dict keys replaces the original three-line numpy argmax; ties
# resolve to the first maximum, exactly as np.where(...)[0][0] did.)
if not TestR2:
    raise ValueError('No regressor reached the R^2 threshold; cannot select a final model.')
Final_Est = max(TestR2, key=TestR2.get)
print(Back.BLACK + Fore.CYAN + Style.NORMAL + 'Best Performing Model:' + Style.RESET_ALL + ' ' + Final_Est)
# Refit the winning model on the full training data and report its scores.
model = make_pipeline(preprocessor, Regressors[Final_Est])
_ = model.fit(X, y)
_, Score = ScoreTable(model, Final_Est)
display(Score)
plot_regression_results(model, Final_Est)
# Predict on the held-out test set and undo the log transform of the target.
Pred = np.exp(model.predict(Test.drop(columns='ID')))
# Assemble the Kaggle submission: one row per test ID with the predicted price.
Predictions = Test.copy()
Predictions[Target] = Pred
Predictions = Predictions[['ID', 'SalePrice']]
display(Predictions.reset_index(drop=True))
# Kaggle expects the column header 'Id', not 'ID'.
Predictions.columns = ['Id', 'SalePrice']
Predictions.to_csv(PATH + '/Submission.csv', index=False)